{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "import tensorflow as tf\r\n",
    "import numpy as np\r\n",
    "import os\r\n",
    "import pickle\r\n",
    "\r\n",
    "SEQUENCE_LENGTH = 50\r\n",
    "EMBEDDING_DIM = 200\r\n",
    "BATCH_SIZE = 128\r\n",
    "FILE_PATH = \"data/python_code.py\"\r\n",
    "BASENAME = os.path.basename(FILE_PATH) + \"-lower\"\r\n",
    "\r\n",
    "text = open(FILE_PATH).read()\r\n",
    "# comment this if you want to use uppercase letters\r\n",
    "text = text.lower()\r\n",
    "n_chars = len(text)\r\n",
    "vocab = ''.join(sorted(set(text)))\r\n",
    "print(\"vocab:\", vocab)\r\n",
    "n_unique_chars = len(vocab)\r\n",
    "print(\"Number of characters:\", n_chars)\r\n",
    "print(\"Number of unique characters:\", n_unique_chars)"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# dictionary that converts characters to integers\r\n",
    "char2int = {c: i for i, c in enumerate(vocab)}\r\n",
    "# dictionary that converts integers to characters\r\n",
    "int2char = {i: c for i, c in enumerate(vocab)}\r\n",
    "\r\n",
    "# save these dictionaries for later generation\r\n",
    "pickle.dump(char2int, open(f\"{BASENAME}-char2int.pickle\", \"wb\"))\r\n",
    "pickle.dump(int2char, open(f\"{BASENAME}-int2char.pickle\", \"wb\"))"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "encoded_text = np.array([char2int[c] for c in text])"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "char_dataset = tf.data.Dataset.from_tensor_slices(encoded_text)\r\n",
    "for element in char_dataset.take(5):\r\n",
    "    print(element.numpy())"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "for element in char_dataset.batch(SEQUENCE_LENGTH+1).shuffle(1024).take(2):\r\n",
    "    print(''.join([int2char[c] for c in element.numpy()]))"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "#help(tf.one_hot)\r\n",
    "#help(char_dataset.window)\r\n",
    "windows = char_dataset.window(SEQUENCE_LENGTH+1, shift=1, drop_remainder=True)\r\n",
    "sequences = windows.flat_map(lambda window: window.batch(SEQUENCE_LENGTH+1))\r\n",
    "dataset = sequences.map(lambda x: (x[:-1], x[-1]))\r\n",
    "for input_, target in dataset.take(10):\r\n",
    "    print(input_.numpy().shape)\r\n",
    "    print(target.numpy().shape)\r\n",
    "    print(''.join([int2char[c] for c in input_.numpy()]), int2char[target.numpy()])\r\n",
    "    print(\"=\"*50)"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "sequences2 = char_dataset.batch(2*SEQUENCE_LENGTH+1, drop_remainder=True)\r\n",
    "\r\n",
    "def split_sample(sample):\r\n",
    "    ds = tf.data.Dataset.from_tensors((sample[:SEQUENCE_LENGTH], sample[SEQUENCE_LENGTH]))\r\n",
    "    for i in range(1, (len(sample)-1) // 2):\r\n",
    "        input_ = sample[i:i+SEQUENCE_LENGTH]\r\n",
    "        target = sample[i+SEQUENCE_LENGTH]\r\n",
    "        other_ds = tf.data.Dataset.from_tensors((input_, target))\r\n",
    "        ds = ds.concatenate(other_ds)\r\n",
    "    return ds\r\n",
    "\r\n",
    "\r\n",
    "dataset2 = sequences2.flat_map(split_sample)\r\n",
    "for element in dataset2.take(10):\r\n",
    "    print(element[0].shape, element[1].shape)\r\n",
    "    print(''.join([int2char[c] for c in element[0].numpy()]), int2char[element[1].numpy()])"
   ],
   "outputs": [],
   "metadata": {
    "tags": [
     "outputPrepend",
     "outputPrepend",
     "outputPrepend",
     "outputPrepend"
    ]
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "for element1, element2 in zip(dataset.take(5), dataset2.take(5)):\r\n",
    "    print(element1[0].numpy() == element2[0].numpy())\r\n",
    "    "
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "def one_hot_samples(input_, target):\r\n",
    "    return tf.one_hot(input_, len(vocab)), tf.one_hot(target, len(vocab))\r\n",
    "#     return input_, tf.one_hot(target, len(vocab))\r\n",
    "\r\n",
    "dataset = dataset.map(one_hot_samples)\r\n",
    "dataset2 = dataset2.map(one_hot_samples)\r\n",
    "for element in dataset.take(10):\r\n",
    "    print(element[0].shape, element[1].shape)"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "ds = dataset.shuffle(1024).batch(BATCH_SIZE, drop_remainder=True).cache().prefetch(1).repeat()\r\n",
    "ds2 = dataset2.shuffle(1024).batch(BATCH_SIZE, drop_remainder=True).cache().prefetch(1).repeat()"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "def create_model(vocab_size, embedding_dim, rnn_units, batch_size):\r\n",
    "    model = tf.keras.Sequential()\r\n",
    "    # model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_shape=(SEQUENCE_LENGTH,)))\r\n",
    "    model.add(tf.keras.layers.LSTM(rnn_units, input_shape=(SEQUENCE_LENGTH, len(vocab)), return_sequences=True))\r\n",
    "    model.add(tf.keras.layers.Dropout(0.3))\r\n",
    "    model.add(tf.keras.layers.LSTM(rnn_units)),\r\n",
    "    model.add(tf.keras.layers.Dropout(0.3))\r\n",
    "    model.add(tf.keras.layers.Dense(vocab_size, activation=\"softmax\"))\r\n",
    "    return model"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "model = create_model(len(vocab), embedding_dim=EMBEDDING_DIM, rnn_units=128, batch_size=BATCH_SIZE)\r\n",
    "model.summary()\r\n",
    "model.compile(optimizer=\"adam\", loss=\"categorical_crossentropy\", metrics=[\"accuracy\"])"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "EPOCHS = 5\r\n",
    "history = model.fit(ds2, steps_per_epoch=(len(encoded_text) - SEQUENCE_LENGTH ) // BATCH_SIZE, epochs=EPOCHS)"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# save the model\r\n",
    "model_path = f\"results/{BASENAME}-{SEQUENCE_LENGTH}-NOEMBEDDING-moredata.h5\"\r\n",
    "model.save(model_path)\r\n",
    "# model.load_weights(model_path)"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "seed = \"\"\"You can be a\"\"\".lower()\r\n",
    "s = seed\r\n",
    "# generate 400 characters\r\n",
    "generated = \"\"\r\n",
    "for i in range(200):\r\n",
    "    # make the input sequence\r\n",
    "    X = np.zeros((1, SEQUENCE_LENGTH, len(vocab)))\r\n",
    "    # X = np.zeros((1, SEQUENCE_LENGTH))\r\n",
    "    for t, char in enumerate(seed):\r\n",
    "        X[0, (SEQUENCE_LENGTH - len(seed)) + t, char2int[char]] = 1\r\n",
    "    # predict the next character\r\n",
    "    predicted = model.predict(X, verbose=0)[0]\r\n",
    "    # print(predicted)\r\n",
    "    # converting the vector to an integer\r\n",
    "    next_index = np.argmax(predicted)\r\n",
    "#     next_index = np.squeeze(np.round(predicted))\r\n",
    "    # converting the integer to a character\r\n",
    "#     print(next_index)\r\n",
    "    next_char = int2char[next_index]\r\n",
    "    # add the character to results\r\n",
    "    generated += next_char\r\n",
    "    # shift seed and the predicted character\r\n",
    "    seed = seed[1:] + next_char\r\n",
    "\r\n",
    "print(\"Generated text:\")\r\n",
    "print(s + generated)"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "char2int\r\n"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [],
   "outputs": [],
   "metadata": {}
  }
 ],
 "metadata": {
  "file_extension": ".py",
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3.8.7 64-bit"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.7"
  },
  "mimetype": "text/x-python",
  "name": "python",
  "npconvert_exporter": "python",
  "pygments_lexer": "ipython3",
  "version": 3,
  "interpreter": {
   "hash": "777490da48e046e3b512f0b24bf037db286a787493a11bf82a9e0f2cbf21bb67"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}